library(car)
library(ggplot2)
library(reshape2)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.4.2
These are not meant to be canonical solutions for this homework question, but rather a sketch of possible approaches for the purpose of discussion in section.
dat <- read.csv("thatch-ant.dat.txt")
head(dat)
## Colony Distance Mass Headwidth Headwidth..mm. Size.class
## 1 1 1 109 45 1.895 >43
## 2 1 1 120 43 1.811 40-43
## 3 1 1 94 42 1.768 40-43
## 4 1 1 61 33 1.389 30-34
## 5 1 1 72 41 1.726 40-43
## 6 1 1 134 46 1.937 >43
dat <- dat[which(dat$Colony <= 6), ]
dat <- na.omit(dat)
dim(dat)
## [1] 648 6
n <- dim(dat)[1]
unique(dat$Colony)
## [1] 1 2 3 4 6 5
unique(dat$Distance)
## [1] 1 4 7 10 0
unique(dat$Headwidth)
## [1] 45 43 42 33 41 46 44 39 40 28 38 37 47 32 29 36 35 34 31 30 48 49 27
unique(dat$Headwidth..mm.)
## [1] 1.895 1.811 1.768 1.389 1.726 1.937 1.853 1.642 1.684 1.179 1.600
## [12] 1.558 1.979 1.347 1.221 1.516 1.474 1.432 1.305 1.263 2.021 2.063
## [23] 1.137
# plot(dat$Headwidth, dat$Headwidth..mm.)
levels(dat$Size.class)
## [1] "<30" "\x80" ">43" "30-34" "35-39" "40-43"
# re-order Size.class levels
dat$Size.class <- factor(dat$Size.class, levels(dat$Size.class)[c(1, 4, 5, 6, 3, 2)])
dat$Colony <- as.factor(dat$Colony)
dat$Distance <- as.factor(dat$Distance)
dat <- dat[, -5] # remove mm version of headwidth
p <- ggplot(dat)
p + geom_bar(aes(x=Colony)) + ggtitle("Histogram of colony size")
p + geom_boxplot(aes(x=Colony, y=Mass)) + ggtitle("Boxplots of mass by colony")
p + geom_boxplot(aes(x=Colony, y=Headwidth)) + ggtitle("Boxplots of head width by colony")
# p + geom_bar(aes(x=Colony, fill=Size.class), position="dodge") + ggtitle("Histograms of size class by colony")
dat.size.colony <- melt(dcast(dat, Size.class ~ Colony, fun.aggregate=length), id.vars="Size.class", variable.name="Colony", value.name="Count")
## Using Size.class as value column: use value.var to override.
ggplot(dat.size.colony, aes(x=Colony, y=Count, fill=Size.class)) + geom_bar(stat="identity", position=position_dodge(width=0.8), width=0.7) + geom_text(aes(label=Count), position=position_dodge(width=0.8), vjust=-0.5, size=2.5)
# p + geom_bar(aes(x=Colony, fill=Distance), position="dodge") + ggtitle("Histograms of distance by colony")
dat.distance.colony <- melt(dcast(dat, Distance ~ Colony, fun.aggregate=length), id.vars="Distance", variable.name="Colony", value.name="Count")
## Using Size.class as value column: use value.var to override.
ggplot(dat.distance.colony, aes(x=Colony, y=Count, fill=Distance)) + geom_bar(stat="identity", position=position_dodge(width=0.8), width=0.7) + geom_text(aes(label=Count), position=position_dodge(width=0.8), vjust=-0.5, size=2.5)
#coplot(Mass ~ Distance | Colony, data=dat, rows=1)
dat.tmp <- dat %>% group_by(Colony, Distance) %>% mutate(Mass.mean=mean(Mass)) %>% ungroup
ggplot(dat.tmp, aes(x=as.numeric(as.character(Distance)), y=Mass)) + geom_point(size=0.5) +
# geom_smooth(method='loess', se=F, method.args=list(degree=1)) +
geom_line(aes(y=Mass.mean), color='blue') +
facet_wrap(~Colony, nrow=1) + theme(aspect.ratio=1) +
ggtitle("Coplot of Mass ~ Distance | Colony") +
xlab("Distance")
p + geom_point(aes(x=Headwidth, y=Mass), size=0.5) + ggtitle("Mass vs. head width")
# p + geom_point(aes(x=Headwidth, y=Mass)) + facet_wrap(~Colony, nrow=1) + theme(aspect.ratio=1) + ggtitle("Coplot of Mass ~ Headwidth | Colony")
When you use lm with one categorical explanatory variable, what dummy variables are used in the model? What happens when you remove the intercept when calling lm? What is the answer to the above questions when there are more than one categorical explanatory variable? Does the order in which you list the explanatory variables matter?
lm. If we do lm(Mass ~ Colony + Distance + Headwidth + Size.class, data=dat), then we are implicitly saying that ants in colony \(c\), distance \(d\), and size class \(s\) follow the model \[y = \beta_{\text{intercept}} + \beta_{\text{colony $c$}} + \beta_{\text{distance $d$}} + \beta_{\text{size class $s$}} + \beta_{\text{headwidth}} \cdot \text{headwidth} + \epsilon,\] where \[\beta_{\text{colony $1$}} = \beta_{\text{distance $0$}} = \beta_{\text{size class $<30$}} = 0.\] (Remember the default behavior of lm for categorical variables. The only coefficients that appear are \(\beta_{\text{intercept}}, \beta_{\text{colony $2$}}, \ldots, \beta_{\text{colony $6$}}, \beta_{\text{distance $1$}}, \ldots, \beta_{\text{distance $10$}},\beta_{\text{size class $30-34$}}, \ldots, \beta_{\text{size class $>43$}}\).)lm. If you do lm(Mass ~ Colony + Distance + Headwidth + Size.class - 1, data=dat) as we do below, then we are implicitly saying that ants in colony \(c\), distance \(d\), and size class \(s\) follow the model \[y = \beta_{\text{colony $c$}} + \beta_{\text{distance $d$}} + \beta_{\text{size class $s$}} + \beta_{\text{headwidth}} \cdot \text{headwidth} + \epsilon,\] where \[\beta_{\text{distance $0$}} = \beta_{\text{size class $<30$}} = 0.\]The above sketch considers variations in how you (or lm) choose dummy variables to encode categorical variables. Note that these changes (intercept or no intercept, etc.) will not change the column space of the design matrix, so most things will remain the same (fitted values, residuals, RSS, etc.). However, the coefficients \(\widehat{\beta}\) will be different simply because the dummy variables represent different things depending on how you encode the categorical variables.
mods <- list()
mods <- c(mods, list(lm(Mass ~ Colony + Distance + Headwidth + Size.class - 1, data=dat)))
# mods <- c(mods, list(lm(sqrt(Mass) ~ Colony + Distance + Headwidth + Size.class - 1, data=dat)))
mods <- c(mods, list(lm(I(Mass^(0.4)) ~ Colony + Distance + Headwidth + Size.class - 1, data=dat)))
mods <- c(mods, list(lm(log(Mass) ~ Colony + Distance + Headwidth + Size.class - 1, data=dat)))
mod <- mods[[1]]
print(summary(mod))
##
## Call:
## lm(formula = Mass ~ Colony + Distance + Headwidth + Size.class -
## 1, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -47.804 -8.022 -0.555 8.003 51.511
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## Colony1 -98.8873 13.7297 -7.202 1.69e-12 ***
## Colony2 -99.5553 13.5927 -7.324 7.34e-13 ***
## Colony3 -94.7184 13.5871 -6.971 7.92e-12 ***
## Colony4 -99.7989 13.5455 -7.368 5.44e-13 ***
## Colony5 -95.2742 13.5730 -7.019 5.76e-12 ***
## Colony6 -99.2774 13.4595 -7.376 5.14e-13 ***
## Distance1 -6.6332 1.9680 -3.371 0.000796 ***
## Distance4 -9.5713 1.9502 -4.908 1.17e-06 ***
## Distance7 -9.4464 2.0165 -4.685 3.44e-06 ***
## Distance10 -10.5882 2.1188 -4.997 7.54e-07 ***
## Headwidth 5.0542 0.4555 11.097 < 2e-16 ***
## Size.class30-34 -2.2907 4.6612 -0.491 0.623278
## Size.class35-39 -6.4351 5.8475 -1.100 0.271539
## Size.class40-43 -4.3846 7.3629 -0.596 0.551722
## Size.class>43 -0.2594 8.8522 -0.029 0.976627
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.77 on 633 degrees of freedom
## Multiple R-squared: 0.9812, Adjusted R-squared: 0.9808
## F-statistic: 2207 on 15 and 633 DF, p-value: < 2.2e-16
fit <- fitted(mod)
rstud <- rstudent(mod)
qplot(fit, rstud, size=I(0.5)) +
geom_hline(yintercept=0) +
xlab("Fitted values") +
ylab("Studentized residuals") +
geom_smooth(method='loess', se=F, method.args=list(degree=1)) +
ggtitle("Studentized residuals vs. fitted values")
qplot(log(fit), log(abs(rstud)), size=I(0.5)) +
geom_smooth(method='lm', se=F) + xlab("log fitted values") + ylab("log abs. stud. res.")
coef(lm(log(abs(rstud)) ~ log(fit)))
## (Intercept) log(fit)
## -3.5125407 0.6031174
for (mod in mods) {
fitted <- fitted(mod)
rstud <- rstudent(mod)
# print(summary(mod))
# dat.mod <- cbind(dat, Fitted=fitted(mod), Stud.Res=rstudent(mod))
# print(
# qplot(qt((1:n) / (n+1), df.residual(mod)), sort(dat.mod$Stud.Res)) + geom_abline(slope=1)
# + xlab("t-distribution quantiles")
# + ylab("Sorted studentized residuals")
# + ggtitle(sprintf("Q-Q plot for %s model", colnames(mod$model)[1]))
# )
print(
qplot(fitted, rstud, size=I(0.5))
+ geom_hline(yintercept=0)
+ xlab("Fitted values")
+ ylab("Studentized residuals")
+ geom_smooth(method='loess', se=F, method.args=list(degree=1))
+ ggtitle(sprintf("Studentized residuals vs. fitted values\nfor %s model", colnames(mod$model)[1]))
)
# print(ggplot(dat.mod, aes(x=log(Fitted), y=log(abs(Stud.Res)))) +
# geom_point(size=0.5) + xlab("log Fitted values") + ylab("log Studentized residuals") + geom_smooth(method='lm', se=F))
# b <- coef(lm(log(abs(Stud.Res))~log(Fitted), data=dat.mod))[2]
# print(b)
}
for (mod in mods) {
crPlot(mod, variable="Headwidth",
main=sprintf("CR plot for head width\nfor %s model", colnames(mod$model)[1]))
}
summary(mods[[3]])
##
## Call:
## lm(formula = log(Mass) ~ Colony + Distance + Headwidth + Size.class -
## 1, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.61411 -0.08430 0.00382 0.09391 0.55504
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## Colony1 2.166692 0.144051 15.041 < 2e-16 ***
## Colony2 2.146691 0.142614 15.052 < 2e-16 ***
## Colony3 2.207843 0.142556 15.488 < 2e-16 ***
## Colony4 2.148735 0.142119 15.119 < 2e-16 ***
## Colony5 2.189606 0.142407 15.376 < 2e-16 ***
## Colony6 2.151806 0.141217 15.238 < 2e-16 ***
## Distance1 -0.072842 0.020648 -3.528 0.000450 ***
## Distance4 -0.096494 0.020461 -4.716 2.96e-06 ***
## Distance7 -0.087004 0.021157 -4.112 4.43e-05 ***
## Distance10 -0.100319 0.022231 -4.513 7.63e-06 ***
## Headwidth 0.053252 0.004779 11.143 < 2e-16 ***
## Size.class30-34 0.175366 0.048905 3.586 0.000362 ***
## Size.class35-39 0.250197 0.061352 4.078 5.12e-05 ***
## Size.class40-43 0.285233 0.077251 3.692 0.000241 ***
## Size.class>43 0.296502 0.092877 3.192 0.001481 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1445 on 633 degrees of freedom
## Multiple R-squared: 0.999, Adjusted R-squared: 0.999
## F-statistic: 4.233e+04 on 15 and 633 DF, p-value: < 2.2e-16
mods.col <- list()
for (i in 1:length(unique(dat$Colony))) {
mod.curr <- lm(log(Mass) ~ Distance + Headwidth + Size.class, data=dat, subset=(Colony == i))
mods.col <- c(mods.col, list(mod.curr))
print(summary(mod.curr))
coefs.dist <- coef(mod.curr)[2:5]
print(
qplot(
as.numeric(as.character(levels(dat$Distance))),
c(0, coefs.dist),
geom="line") +
xlab("Distance") + ylab("Offset from Distance zero") +
ggtitle(sprintf("Mass differences by distance class,\n Colony %d", i)) +
ylim(-0.2, 0.2)
)
}
##
## Call:
## lm(formula = log(Mass) ~ Distance + Headwidth + Size.class, data = dat,
## subset = (Colony == i))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.27804 -0.07940 -0.01233 0.05629 0.41261
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.47044 0.38905 6.350 6.22e-09 ***
## Distance1 -0.13998 0.04812 -2.909 0.004462 **
## Distance4 -0.14998 0.04518 -3.320 0.001255 **
## Distance7 -0.12899 0.04885 -2.641 0.009588 **
## Distance10 -0.17895 0.04863 -3.680 0.000377 ***
## Headwidth 0.04627 0.01310 3.532 0.000624 ***
## Size.class30-34 0.17610 0.10883 1.618 0.108739
## Size.class35-39 0.30630 0.15334 1.997 0.048463 *
## Size.class40-43 0.32731 0.19601 1.670 0.098039 .
## Size.class>43 0.32999 0.23959 1.377 0.171471
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1303 on 101 degrees of freedom
## Multiple R-squared: 0.793, Adjusted R-squared: 0.7745
## F-statistic: 42.99 on 9 and 101 DF, p-value: < 2.2e-16
##
## Call:
## lm(formula = log(Mass) ~ Distance + Headwidth + Size.class, data = dat,
## subset = (Colony == i))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.62362 -0.07475 0.00107 0.11119 0.26351
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.02183 0.38430 5.261 9.45e-07 ***
## Distance1 -0.02325 0.05272 -0.441 0.66029
## Distance4 -0.14630 0.05336 -2.742 0.00735 **
## Distance7 -0.02323 0.05373 -0.432 0.66656
## Distance10 -0.06167 0.05317 -1.160 0.24908
## Headwidth 0.06083 0.01211 5.024 2.51e-06 ***
## Size.class35-39 0.06192 0.08884 0.697 0.48755
## Size.class40-43 0.05515 0.13351 0.413 0.68052
## Size.class>43 0.08399 0.17381 0.483 0.63010
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1452 on 91 degrees of freedom
## Multiple R-squared: 0.7985, Adjusted R-squared: 0.7808
## F-statistic: 45.07 on 8 and 91 DF, p-value: < 2.2e-16
##
## Call:
## lm(formula = log(Mass) ~ Distance + Headwidth + Size.class, data = dat,
## subset = (Colony == i))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.32130 -0.07811 -0.00152 0.07140 0.34807
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.390280 0.293065 8.156 1.94e-12 ***
## Distance1 -0.145514 0.045102 -3.226 0.00175 **
## Distance4 -0.116551 0.045708 -2.550 0.01247 *
## Distance7 -0.124728 0.046152 -2.703 0.00823 **
## Distance10 -0.126132 0.045695 -2.760 0.00700 **
## Headwidth 0.046313 0.009595 4.827 5.64e-06 ***
## Size.class30-34 0.334938 0.102393 3.271 0.00152 **
## Size.class35-39 0.334920 0.124359 2.693 0.00844 **
## Size.class40-43 0.421685 0.153060 2.755 0.00710 **
## Size.class>43 0.464274 0.186393 2.491 0.01458 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.125 on 90 degrees of freedom
## Multiple R-squared: 0.8511, Adjusted R-squared: 0.8362
## F-statistic: 57.15 on 9 and 90 DF, p-value: < 2.2e-16
##
## Call:
## lm(formula = log(Mass) ~ Distance + Headwidth + Size.class, data = dat,
## subset = (Colony == i))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.49421 -0.08658 0.00371 0.08370 0.42056
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.908707 0.419415 4.551 1.42e-05 ***
## Distance1 -0.014610 0.051806 -0.282 0.778
## Distance4 -0.008673 0.054136 -0.160 0.873
## Distance7 0.015396 0.054796 0.281 0.779
## Distance10 -0.050181 0.055344 -0.907 0.367
## Headwidth 0.062405 0.012720 4.906 3.34e-06 ***
## Size.class35-39 0.075239 0.091273 0.824 0.412
## Size.class40-43 0.071950 0.129130 0.557 0.579
## Size.class>43 0.092176 0.171368 0.538 0.592
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1464 on 107 degrees of freedom
## Multiple R-squared: 0.7237, Adjusted R-squared: 0.7031
## F-statistic: 35.04 on 8 and 107 DF, p-value: < 2.2e-16
##
## Call:
## lm(formula = log(Mass) ~ Distance + Headwidth + Size.class, data = dat,
## subset = (Colony == i))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.47156 -0.09616 0.01064 0.11597 0.51390
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.20022 0.48538 4.533 1.96e-05 ***
## Distance1 -0.01119 0.06062 -0.185 0.85399
## Distance4 -0.05887 0.06192 -0.951 0.34453
## Distance7 -0.16309 0.06245 -2.611 0.01072 *
## Headwidth 0.04908 0.01620 3.030 0.00327 **
## Size.class30-34 0.23009 0.13493 1.705 0.09193 .
## Size.class35-39 0.38897 0.17548 2.217 0.02942 *
## Size.class40-43 0.45258 0.24163 1.873 0.06463 .
## Size.class>43 0.45434 0.29582 1.536 0.12842
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1706 on 82 degrees of freedom
## Multiple R-squared: 0.7758, Adjusted R-squared: 0.7539
## F-statistic: 35.46 on 8 and 82 DF, p-value: < 2.2e-16
##
## Call:
## lm(formula = log(Mass) ~ Distance + Headwidth + Size.class, data = dat,
## subset = (Colony == i))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.42929 -0.06602 0.00576 0.07460 0.44691
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.140719 0.285638 7.495 1.25e-11 ***
## Distance1 -0.101796 0.049021 -2.077 0.0400 *
## Distance4 -0.108514 0.049340 -2.199 0.0298 *
## Distance7 -0.095709 0.050832 -1.883 0.0621 .
## Distance10 -0.114316 0.052651 -2.171 0.0319 *
## Headwidth 0.055457 0.009934 5.582 1.50e-07 ***
## Size.class30-34 0.096436 0.087659 1.100 0.2735
## Size.class35-39 0.207890 0.119752 1.736 0.0851 .
## Size.class40-43 0.223481 0.155390 1.438 0.1530
## Size.class>43 0.216806 0.190607 1.137 0.2576
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1443 on 120 degrees of freedom
## Multiple R-squared: 0.8498, Adjusted R-squared: 0.8385
## F-statistic: 75.42 on 9 and 120 DF, p-value: < 2.2e-16
mod <- lm(Mass ~ . - 1, data=dat)
crPlot(mod, variable="Headwidth", main="CR plot for head width")
summary(mod)
##
## Call:
## lm(formula = Mass ~ . - 1, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -47.804 -8.022 -0.555 8.003 51.511
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## Colony1 -98.8873 13.7297 -7.202 1.69e-12 ***
## Colony2 -99.5553 13.5927 -7.324 7.34e-13 ***
## Colony3 -94.7184 13.5871 -6.971 7.92e-12 ***
## Colony4 -99.7989 13.5455 -7.368 5.44e-13 ***
## Colony5 -95.2742 13.5730 -7.019 5.76e-12 ***
## Colony6 -99.2774 13.4595 -7.376 5.14e-13 ***
## Distance1 -6.6332 1.9680 -3.371 0.000796 ***
## Distance4 -9.5713 1.9502 -4.908 1.17e-06 ***
## Distance7 -9.4464 2.0165 -4.685 3.44e-06 ***
## Distance10 -10.5882 2.1188 -4.997 7.54e-07 ***
## Headwidth 5.0542 0.4555 11.097 < 2e-16 ***
## Size.class30-34 -2.2907 4.6612 -0.491 0.623278
## Size.class35-39 -6.4351 5.8475 -1.100 0.271539
## Size.class40-43 -4.3846 7.3629 -0.596 0.551722
## Size.class>43 -0.2594 8.8522 -0.029 0.976627
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.77 on 633 degrees of freedom
## Multiple R-squared: 0.9812, Adjusted R-squared: 0.9808
## F-statistic: 2207 on 15 and 633 DF, p-value: < 2.2e-16
mod <- lm(Mass ~ . + I(Headwidth^2) - 1, data=dat)
crPlots(mod, terms=~Headwidth + I(Headwidth^2), main="CR plot for head width")
summary(mod)
##
## Call:
## lm(formula = Mass ~ . + I(Headwidth^2) - 1, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -48.397 -7.714 -0.633 7.835 50.652
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## Colony1 162.47770 76.03678 2.137 0.03300 *
## Colony2 161.47260 75.91767 2.127 0.03381 *
## Colony3 166.41567 75.94658 2.191 0.02880 *
## Colony4 161.66773 76.03296 2.126 0.03387 *
## Colony5 165.70549 75.90061 2.183 0.02939 *
## Colony6 161.76229 75.89766 2.131 0.03345 *
## Distance1 -6.42364 1.95175 -3.291 0.00105 **
## Distance4 -9.43454 1.93351 -4.879 1.35e-06 ***
## Distance7 -8.89666 2.00505 -4.437 1.08e-05 ***
## Distance10 -10.38091 2.10114 -4.941 9.98e-07 ***
## Headwidth -9.27095 4.12499 -2.248 0.02495 *
## Size.class30-34 11.74121 6.12200 1.918 0.05558 .
## Size.class35-39 17.69214 9.01601 1.962 0.05017 .
## Size.class40-43 20.63467 10.22500 2.018 0.04401 *
## Size.class>43 20.25507 10.55822 1.918 0.05551 .
## I(Headwidth^2) 0.17865 0.05113 3.494 0.00051 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.65 on 632 degrees of freedom
## Multiple R-squared: 0.9816, Adjusted R-squared: 0.9811
## F-statistic: 2106 on 16 and 632 DF, p-value: < 2.2e-16
mods.col <- list()
for (i in 1:length(unique(dat$Colony))) {
mod.curr <- lm(log(Mass) ~ Distance + Headwidth + I(Headwidth^2) + Size.class, data=dat, subset=(Colony == i))
mods.col <- c(mods.col, list(mod.curr))
print(summary(mod.curr))
coefs.dist <- coef(mod.curr)[2:5]
print(
qplot(
as.numeric(as.character(levels(dat$Distance))),
c(0, coefs.dist),
geom="line") +
xlab("Distance") + ylab("Offset from Distance zero") +
ggtitle(sprintf("Mass differences by distance class,\n Colony %d", i)) +
ylim(-0.2, 0.2)
)
}
##
## Call:
## lm(formula = log(Mass) ~ Distance + Headwidth + I(Headwidth^2) +
## Size.class, data = dat, subset = (Colony == i))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.27643 -0.07440 -0.01439 0.05836 0.41154
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.198574 3.231483 -0.680 0.497848
## Distance1 -0.148450 0.048213 -3.079 0.002681 **
## Distance4 -0.149359 0.044933 -3.324 0.001241 **
## Distance7 -0.129252 0.048581 -2.661 0.009088 **
## Distance10 -0.178161 0.048365 -3.684 0.000373 ***
## Headwidth 0.295214 0.171555 1.721 0.088378 .
## I(Headwidth^2) -0.002970 0.002041 -1.455 0.148713
## Size.class30-34 -0.124578 0.233241 -0.534 0.594446
## Size.class35-39 -0.196618 0.377722 -0.521 0.603841
## Size.class40-43 -0.215971 0.421139 -0.513 0.609204
## Size.class>43 -0.172914 0.419748 -0.412 0.681260
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1296 on 100 degrees of freedom
## Multiple R-squared: 0.7973, Adjusted R-squared: 0.777
## F-statistic: 39.33 on 10 and 100 DF, p-value: < 2.2e-16
##
## Call:
## lm(formula = log(Mass) ~ Distance + Headwidth + I(Headwidth^2) +
## Size.class, data = dat, subset = (Colony == i))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.62810 -0.07403 0.00608 0.10124 0.26290
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.230330 2.087562 2.026 0.0457 *
## Distance1 -0.017700 0.052927 -0.334 0.7388
## Distance4 -0.140699 0.053563 -2.627 0.0101 *
## Distance7 -0.014969 0.054232 -0.276 0.7832
## Distance10 -0.060122 0.053140 -1.131 0.2609
## Headwidth -0.055050 0.108342 -0.508 0.6126
## I(Headwidth^2) 0.001453 0.001350 1.076 0.2847
## Size.class35-39 0.144508 0.117328 1.232 0.2213
## Size.class40-43 0.146266 0.157990 0.926 0.3570
## Size.class>43 0.135779 0.180198 0.754 0.4531
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1451 on 90 degrees of freedom
## Multiple R-squared: 0.801, Adjusted R-squared: 0.7811
## F-statistic: 40.26 on 9 and 90 DF, p-value: < 2.2e-16
##
## Call:
## lm(formula = log(Mass) ~ Distance + Headwidth + I(Headwidth^2) +
## Size.class, data = dat, subset = (Colony == i))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.32066 -0.07582 -0.00118 0.07037 0.34189
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.0690369 1.6679368 1.240 0.21806
## Distance1 -0.1444423 0.0456745 -3.162 0.00214 **
## Distance4 -0.1144380 0.0472054 -2.424 0.01736 *
## Distance7 -0.1254566 0.0465499 -2.695 0.00841 **
## Distance10 -0.1260310 0.0459439 -2.743 0.00736 **
## Headwidth 0.0636913 0.0893347 0.713 0.47774
## I(Headwidth^2) -0.0002179 0.0011137 -0.196 0.84531
## Size.class30-34 0.3233035 0.1188801 2.720 0.00786 **
## Size.class35-39 0.3108879 0.1752603 1.774 0.07951 .
## Size.class40-43 0.3965094 0.2005823 1.977 0.05116 .
## Size.class>43 0.4462862 0.2087287 2.138 0.03525 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1257 on 89 degrees of freedom
## Multiple R-squared: 0.8511, Adjusted R-squared: 0.8344
## F-statistic: 50.89 on 10 and 89 DF, p-value: < 2.2e-16
##
## Call:
## lm(formula = log(Mass) ~ Distance + Headwidth + I(Headwidth^2) +
## Size.class, data = dat, subset = (Colony == i))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.49312 -0.08631 0.00431 0.08602 0.42549
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.847859 2.578619 1.492 0.139
## Distance1 -0.014751 0.051908 -0.284 0.777
## Distance4 -0.006610 0.054310 -0.122 0.903
## Distance7 0.018057 0.055015 0.328 0.743
## Distance10 -0.052325 0.055524 -0.942 0.348
## Headwidth -0.038225 0.132639 -0.288 0.774
## I(Headwidth^2) 0.001255 0.001647 0.762 0.448
## Size.class35-39 0.142697 0.127265 1.121 0.265
## Size.class40-43 0.144954 0.160977 0.900 0.370
## Size.class>43 0.136957 0.181478 0.755 0.452
##
## Residual standard error: 0.1467 on 106 degrees of freedom
## Multiple R-squared: 0.7252, Adjusted R-squared: 0.7019
## F-statistic: 31.09 on 9 and 106 DF, p-value: < 2.2e-16
##
## Call:
## lm(formula = log(Mass) ~ Distance + Headwidth + I(Headwidth^2) +
## Size.class, data = dat, subset = (Colony == i))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.47469 -0.09762 0.00590 0.11600 0.50259
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.457246 3.130566 1.104 0.2727
## Distance1 -0.008092 0.061405 -0.132 0.8955
## Distance4 -0.057925 0.062280 -0.930 0.3551
## Distance7 -0.160488 0.063095 -2.544 0.0129 *
## Headwidth -0.019688 0.169964 -0.116 0.9081
## I(Headwidth^2) 0.000860 0.002116 0.407 0.6854
## Size.class30-34 0.285857 0.192912 1.482 0.1423
## Size.class35-39 0.495838 0.316591 1.566 0.1212
## Size.class40-43 0.564327 0.366824 1.538 0.1278
## Size.class>43 0.543271 0.369145 1.472 0.1450
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1714 on 81 degrees of freedom
## Multiple R-squared: 0.7762, Adjusted R-squared: 0.7514
## F-statistic: 31.22 on 9 and 81 DF, p-value: < 2.2e-16
##
## Call:
## lm(formula = log(Mass) ~ Distance + Headwidth + I(Headwidth^2) +
## Size.class, data = dat, subset = (Colony == i))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.42879 -0.06685 0.00995 0.07635 0.43715
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.1763227 1.5421690 2.060 0.0416 *
## Distance1 -0.0992721 0.0492687 -2.015 0.0462 *
## Distance4 -0.1036463 0.0499599 -2.075 0.0402 *
## Distance7 -0.0908888 0.0514312 -1.767 0.0798 .
## Distance10 -0.1093305 0.0532700 -2.052 0.0423 *
## Headwidth -0.0022259 0.0849913 -0.026 0.9791
## I(Headwidth^2) 0.0007212 0.0010554 0.683 0.4957
## Size.class30-34 0.1624245 0.1305447 1.244 0.2159
## Size.class35-39 0.3149179 0.1973099 1.596 0.1131
## Size.class40-43 0.3350050 0.2255764 1.485 0.1402
## Size.class>43 0.3092764 0.2340980 1.321 0.1890
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1447 on 119 degrees of freedom
## Multiple R-squared: 0.8504, Adjusted R-squared: 0.8378
## F-statistic: 67.63 on 10 and 119 DF, p-value: < 2.2e-16
dat$Distance <- as.numeric(as.character(dat$Distance))
mod <- lm(Mass ~ . - 1, data=dat)
summary(mod)
##
## Call:
## lm(formula = Mass ~ . - 1, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -49.338 -7.951 -1.057 7.556 50.135
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## Colony1 -107.3491 13.6517 -7.863 1.61e-14 ***
## Colony2 -107.6902 13.5237 -7.963 7.75e-15 ***
## Colony3 -102.8121 13.5194 -7.605 1.03e-13 ***
## Colony4 -108.0597 13.4688 -8.023 4.98e-15 ***
## Colony5 -103.8467 13.4831 -7.702 5.15e-14 ***
## Colony6 -107.7151 13.3733 -8.054 3.95e-15 ***
## Distance -0.7024 0.1629 -4.311 1.88e-05 ***
## Headwidth 5.1660 0.4582 11.275 < 2e-16 ***
## Size.class30-34 -2.3443 4.6846 -0.500 0.617
## Size.class35-39 -7.2879 5.8753 -1.240 0.215
## Size.class40-43 -5.4346 7.4005 -0.734 0.463
## Size.class>43 -1.6415 8.9006 -0.184 0.854
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.89 on 636 degrees of freedom
## Multiple R-squared: 0.9808, Adjusted R-squared: 0.9805
## F-statistic: 2711 on 12 and 636 DF, p-value: < 2.2e-16
crPlots(mod, terms=~Headwidth+Distance)
mod <- lm(Mass ~ . + I(Headwidth^2) - 1, data=dat)
summary(mod)
##
## Call:
## lm(formula = Mass ~ . + I(Headwidth^2) - 1, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -50.169 -7.493 -0.708 7.546 48.513
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## Colony1 156.8651 76.4647 2.051 0.040631 *
## Colony2 156.1822 76.3465 2.046 0.041197 *
## Colony3 161.1700 76.3766 2.110 0.035231 *
## Colony4 156.2623 76.4631 2.044 0.041402 *
## Colony5 160.0268 76.3398 2.096 0.036456 *
## Colony6 156.1846 76.3282 2.046 0.041146 *
## Distance -0.6770 0.1616 -4.188 3.21e-05 ***
## Headwidth -9.3103 4.1483 -2.244 0.025153 *
## Size.class30-34 11.9365 6.1731 1.934 0.053604 .
## Size.class35-39 17.2013 9.0869 1.893 0.058815 .
## Size.class40-43 19.9846 10.3069 1.939 0.052949 .
## Size.class>43 19.2501 10.6417 1.809 0.070933 .
## I(Headwidth^2) 0.1805 0.0514 3.511 0.000478 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.77 on 635 degrees of freedom
## Multiple R-squared: 0.9812, Adjusted R-squared: 0.9808
## F-statistic: 2548 on 13 and 635 DF, p-value: < 2.2e-16
crPlots(mod, terms=~Headwidth+I(Headwidth^2)+Distance, layout=c(2,2))
mod <- lm(Mass ~ . + I(Headwidth^2) + I(Distance^2) - 1, data=dat)
summary(mod)
##
## Call:
## lm(formula = Mass ~ . + I(Headwidth^2) + I(Distance^2) - 1, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -49.248 -7.933 -0.541 7.646 50.063
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## Colony1 153.77536 76.18024 2.019 0.043952 *
## Colony2 152.81856 76.06449 2.009 0.044954 *
## Colony3 157.78987 76.09452 2.074 0.038519 *
## Colony4 152.92464 76.18039 2.007 0.045131 *
## Colony5 157.20031 76.05410 2.067 0.039144 *
## Colony6 153.00640 76.04486 2.012 0.044637 *
## Distance -2.01429 0.57232 -3.519 0.000463 ***
## Headwidth -9.00295 4.13423 -2.178 0.029799 *
## Size.class30-34 11.80885 6.14955 1.920 0.055271 .
## Size.class35-39 16.87972 9.05281 1.865 0.062702 .
## Size.class40-43 19.78927 10.26743 1.927 0.054378 .
## Size.class>43 19.41969 10.60082 1.832 0.067435 .
## I(Headwidth^2) 0.17598 0.05124 3.435 0.000632 ***
## I(Distance^2) 0.13516 0.05551 2.435 0.015169 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.72 on 634 degrees of freedom
## Multiple R-squared: 0.9814, Adjusted R-squared: 0.981
## F-statistic: 2385 on 14 and 634 DF, p-value: < 2.2e-16
crPlots(mod, terms=~Headwidth+I(Headwidth^2)+Distance+I(Distance^2), layout=c(2,2))
mods.col <- list()
for (i in 1:length(unique(dat$Colony))) {
mod.curr <- lm(log(Mass) ~ Distance + Headwidth + I(Headwidth^2) + Size.class, data=dat, subset=(Colony == i))
mods.col <- c(mods.col, list(mod.curr))
print(summary(mod.curr))
}
##
## Call:
## lm(formula = log(Mass) ~ Distance + Headwidth + I(Headwidth^2) +
## Size.class, data = dat, subset = (Colony == i))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.31195 -0.08483 -0.00879 0.06696 0.38096
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.552881 3.308195 -0.469 0.6398
## Distance -0.008266 0.003862 -2.140 0.0347 *
## Headwidth 0.251149 0.175496 1.431 0.1554
## I(Headwidth^2) -0.002364 0.002085 -1.134 0.2594
## Size.class30-34 -0.063692 0.240349 -0.265 0.7915
## Size.class35-39 -0.140551 0.389193 -0.361 0.7187
## Size.class40-43 -0.172844 0.434286 -0.398 0.6915
## Size.class>43 -0.171452 0.433246 -0.396 0.6931
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1341 on 103 degrees of freedom
## Multiple R-squared: 0.7766, Adjusted R-squared: 0.7614
## F-statistic: 51.15 on 7 and 103 DF, p-value: < 2.2e-16
##
##
## Call:
## lm(formula = log(Mass) ~ Distance + Headwidth + I(Headwidth^2) +
## Size.class, data = dat, subset = (Colony == i))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.62049 -0.07120 0.01291 0.09140 0.31079
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.421071 2.145981 2.060 0.0422 *
## Distance -0.002676 0.004307 -0.621 0.5358
## Headwidth -0.062732 0.111198 -0.564 0.5740
## I(Headwidth^2) 0.001500 0.001388 1.081 0.2826
## Size.class35-39 0.129235 0.120054 1.076 0.2845
## Size.class40-43 0.157579 0.160627 0.981 0.3291
## Size.class>43 0.151702 0.185315 0.819 0.4151
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1515 on 93 degrees of freedom
## Multiple R-squared: 0.7759, Adjusted R-squared: 0.7615
## F-statistic: 53.68 on 6 and 93 DF, p-value: < 2.2e-16
##
##
## Call:
## lm(formula = log(Mass) ~ Distance + Headwidth + I(Headwidth^2) +
## Size.class, data = dat, subset = (Colony == i))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.32610 -0.07117 -0.00882 0.08325 0.33967
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.3780533 1.6161729 0.853 0.3961
## Distance -0.0045702 0.0036937 -1.237 0.2191
## Headwidth 0.0973110 0.0867151 1.122 0.2647
## I(Headwidth^2) -0.0006388 0.0010825 -0.590 0.5566
## Size.class30-34 0.2795899 0.1193517 2.343 0.0213 *
## Size.class35-39 0.2438500 0.1728702 1.411 0.1617
## Size.class40-43 0.3249633 0.1978444 1.643 0.1039
## Size.class>43 0.3926782 0.2107608 1.863 0.0656 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1301 on 92 degrees of freedom
## Multiple R-squared: 0.8351, Adjusted R-squared: 0.8226
## F-statistic: 66.57 on 7 and 92 DF, p-value: < 2.2e-16
##
##
## Call:
## lm(formula = log(Mass) ~ Distance + Headwidth + I(Headwidth^2) +
## Size.class, data = dat, subset = (Colony == i))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.46009 -0.08483 0.00447 0.08672 0.40491
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.4220579 2.5502471 1.342 0.182
## Distance -0.0027550 0.0041094 -0.670 0.504
## Headwidth -0.0130204 0.1308941 -0.099 0.921
## I(Headwidth^2) 0.0008912 0.0016204 0.550 0.583
## Size.class35-39 0.1299514 0.1260114 1.031 0.305
## Size.class40-43 0.1535099 0.1598058 0.961 0.339
## Size.class>43 0.1667161 0.1795379 0.929 0.355
##
## Residual standard error: 0.1462 on 109 degrees of freedom
## Multiple R-squared: 0.7193, Adjusted R-squared: 0.7039
## F-statistic: 46.55 on 6 and 109 DF, p-value: < 2.2e-16
##
##
## Call:
## lm(formula = log(Mass) ~ Distance + Headwidth + I(Headwidth^2) +
## Size.class, data = dat, subset = (Colony == i))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.45141 -0.10306 0.00669 0.11302 0.51690
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.6075698 3.0739941 1.174 0.24392
## Distance -0.0235189 0.0069786 -3.370 0.00114 **
## Headwidth -0.0250366 0.1668124 -0.150 0.88106
## I(Headwidth^2) 0.0009065 0.0020741 0.437 0.66320
## Size.class30-34 0.2869502 0.1903189 1.508 0.13542
## Size.class35-39 0.4975452 0.3136175 1.586 0.11644
## Size.class40-43 0.5711825 0.3633973 1.572 0.11980
## Size.class>43 0.5533816 0.3654573 1.514 0.13377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1699 on 83 degrees of freedom
## Multiple R-squared: 0.7748, Adjusted R-squared: 0.7558
## F-statistic: 40.8 on 7 and 83 DF, p-value: < 2.2e-16
##
##
## Call:
## lm(formula = log(Mass) ~ Distance + Headwidth + I(Headwidth^2) +
## Size.class, data = dat, subset = (Colony == i))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.43048 -0.07175 0.00922 0.07637 0.42193
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.3008183 1.5433877 2.139 0.0345 *
## Distance -0.0040700 0.0039060 -1.042 0.2995
## Headwidth -0.0140875 0.0849431 -0.166 0.8686
## I(Headwidth^2) 0.0008959 0.0010540 0.850 0.3970
## Size.class30-34 0.1591814 0.1310688 1.214 0.2269
## Size.class35-39 0.3116475 0.1979461 1.574 0.1180
## Size.class40-43 0.3363065 0.2262235 1.487 0.1397
## Size.class>43 0.2891843 0.2347295 1.232 0.2203
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1453 on 122 degrees of freedom
## Multiple R-squared: 0.8452, Adjusted R-squared: 0.8363
## F-statistic: 95.13 on 7 and 122 DF, p-value: < 2.2e-16
for (i in 1:length(unique(dat$Colony))) {
print(coef(mods.col[[i]])[2])
}
## Distance
## -0.008266139
## Distance
## -0.002676431
## Distance
## -0.004570243
## Distance
## -0.002754967
## Distance
## -0.02351894
## Distance
## -0.004070044